# -*- coding: utf-8 -*-



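# Free proxy scraper / checker.
#
# Usage examples (assuming the script is saved as proxy.py -- the code itself
# does not fix a file name):
#   python proxy.py -g           # scrape proxy lists from all source sites
#   python proxy.py -c http      # check stored proxies as plain HTTP proxies
#   python proxy.py -c connect   # check stored proxies through the socks module (HTTP CONNECT / SOCKS4 / SOCKS5)
#   python proxy.py -t 7         # test-scrape source site #7 only; nothing is written to the database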
import urllib2, time, random, re, threading, string, sys, urllib, getopt
import MySQLdb, socks, socket
from bs4 import BeautifulSoup

web_site_count = 13  # number of source sites to scrape
indebug = 1  # when set, print every parsed proxy while scraping

thread_num = 200  # number of worker threads used to check proxies
check_in_one_call = thread_num * 10  # upper bound on proxies checked in one run

db_host = 'localhost'  # database settings
db_port = 3306
db_user = 'root'
db_passwd = '123456'
db_database = 'social'
db_charset = 'utf8'

target_url = "http://www.baidu.com/"  # URL fetched through each proxy during validation
target_string = "030173"  # a proxy is considered working if the returned HTML
target_timeout = 10       # contains this string and the response arrives in
                          # less than target_timeout seconds

proxy_use = 1  # whether to fetch the list pages through a local proxy
proxy_ip = '127.0.0.1'
proxy_port = '8087'

proxy_array = []  # proxies waiting to be inserted into the database
update_array = []  # check results waiting to be written back to the database

conn = None  # global database connection and cursor
cursor = None
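
# The `proxy` table is assumed to look roughly like the sketch below; this is
# only inferred from the queries in this file, not an authoritative schema.
#
#   CREATE TABLE proxy (
#     ip        VARCHAR(15) NOT NULL,
#     port      INT,
#     method    TINYINT,    -- 1=HTTP, 2=SOCKS4, 3=SOCKS5
#     type      TINYINT,    -- 0=transparent, 1=anonymous, 2=high anonymity, -1=unknown
#     area      VARCHAR(255),
#     intime    DATETIME,   -- when the proxy was scraped
#     checktime DATETIME,   -- when it was last checked
#     active    TINYINT DEFAULT 0,
#     speed     FLOAT       -- response time of the last check, in seconds
#   );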

def usage():
  print u"-h              show this help"
  print u"-g              scrape proxies from all source sites"
  print u"-c http|connect check proxies (http: plain HTTP proxies, connect: proxies checked through the socks module)"
  print u"-t id           test-scrape only the site with the given id, without writing to the database"

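# Fetch a URL and return the raw response body, or '' on any error.
# When proxy_use is 1, the request itself is sent through the local proxy
# configured by proxy_ip / proxy_port.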
def get_html(url=''):
    if proxy_use == 1:
      opener = urllib.FancyURLopener({'http': 'http://' + proxy_ip + ':' + proxy_port + '/'})
    else:
      opener = urllib.FancyURLopener({})
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36')]
    try:
      f = opener.open(url)
      return f.read()
    except Exception, e:
      print e
      return ''	

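# Every source site N gets a pair of functions:
#   build_list_urls_N()  -> the list-page URLs to fetch
#   parse_page_N(html)   -> a list of [ip, port, method, type, area] entries
# where method is 1=HTTP, 2=SOCKS4, 3=SOCKS5 and type is the anonymity level
# (0=transparent, 1=anonymous, 2=high anonymity, -1=unknown).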
def build_list_urls_1(page=10):
	page = page + 1
	ret = []
	for i in range(1, page):
		ret.append('http://www.cnproxy.com/proxy%(num)01d.html' % {'num':i})		
	return ret

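# cnproxy.com hides each port behind a document.write(":"+v+m+...) call built
# from single-letter JS variables; parse_page_1 decodes it with a
# letter-to-digit table.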
def parse_page_1(html=''):
  matches = re.findall('<tr><td>(.*?)<script.*?>document.write\(\":\"\+(.*?)\)</script></td><td>(.*?)</td><td>.*?</td><td>(.*?)</td>', html, re.IGNORECASE)
  ret = []
  for match in matches:
    ip = match[0]
    port = match[1]
    digit_for = {'v': '3', 'm': '4', 'a': '2', 'l': '9', 'q': '0', 'b': '5', 'i': '7', 'w': '6', 'r': '8', 'c': '1'}
    port = ''.join(digit_for.get(letter.strip(), '') for letter in port.split('+'))
    method = match[2]
    if method == 'HTTP':
      method = 1
    elif method == 'SOCKS4':
      method = 2
    elif method == 'SOCKS5':
      method = 3
    type = -1  # this site does not publish the anonymity type
    area = match[3]
    if indebug:
      print '1', ip, port, method, type, area
    area = unicode(area, 'cp936')
    area = area.encode('utf8')
    ret.append([ip, port, method, type, area])
  return ret

def build_list_urls_2(page=1):
	return ['http://www.proxylists.net/http_highanon.txt']

def parse_page_2(html=''):
  matches = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})', html)
  ret = []
  for match in matches:
		ip = match[0]
		port = match[1]
		method = 1
		type = 2         
		area = '--'
		ret.append([ip, port, method, type, area])
		if indebug:
		  print '2', ip, port, method, type, area
  return ret


def build_list_urls_3(page=1):
	return ['http://www.proxylists.net/http.txt']

def parse_page_3(html=''):
  matches = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})', html)
  ret = []
  for match in matches:
		ip = match[0]
		port = match[1]
		method = 1
		type = -1         
		area = '--'
		ret.append([ip, port, method, type, area])
		if indebug:
		  print '3', ip, port, method, type, area
  return ret



def build_list_urls_4(page=5):
	page = page + 1
	ret = []
	for i in range(0, page):
		ret.append('http://proxylist.sakura.ne.jp/index.htm?pages=%(n)01d' % {'n':i})		
	return ret

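# proxylist.sakura.ne.jp prints each address through a JS call
# proxy(order, o1, o2, o3, o4, port) whose first argument selects how the four
# octets are reordered; parse_page_4 below undoes that shuffle.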
def parse_page_4(html=''):
  matches = re.findall('<TD.*?>[\s\S]*?<script.*?>[\s\S]*?proxy\((.*?)\);[\s\S]*?<\/script>[\s\S]*?<\/TD>[\s\S]*?<TD>(.*?)<\/TD>[\s\S]*?', html, re.IGNORECASE)
  ret = []
  for match in matches:
    ipandportlist = match[0].split(',')
    port = ipandportlist[5]
    if ipandportlist[0] == '1':
      ip = ipandportlist[1].strip("'") + "." + ipandportlist[2].strip("'") + "." + ipandportlist[3].strip("'") + "." + ipandportlist[4].strip("'")
    elif ipandportlist[0] == '2':
      ip = ipandportlist[4].strip("'") + "." + ipandportlist[1].strip("'") + "." + ipandportlist[2].strip("'") + "." + ipandportlist[3].strip("'")
    elif ipandportlist[0] == '3':
      ip = ipandportlist[3].strip("'") + "." + ipandportlist[4].strip("'") + "." + ipandportlist[1].strip("'") + "." + ipandportlist[2].strip("'")
    elif ipandportlist[0] == '4':
      ip = ipandportlist[2].strip("'") + "." + ipandportlist[3].strip("'") + "." + ipandportlist[4].strip("'") + "." + ipandportlist[1].strip("'")
    method = 1
    area = match[1]
    # the original compared `type` (always -1 at this point) to 'Anonymous', so
    # the check could never fire; the label presumably comes from the second <TD>
    if area == 'Anonymous':
      type = 1
    else:
      type = -1
    ret.append([ip, port, method, type, area])
    if indebug:
      print '4', ip, port, method, type, area
  return ret

def build_list_urls_5(page=10):
	page = page + 1
	ret = []
	for i in range(1, page):
	  ret.append('http://www.my-proxy.com/free-proxy-list-%(n)01d.html' % {'n':i})
	ret.append('http://www.my-proxy.com/free-proxy-list-s1.html')
	ret.append('http://www.my-proxy.com/free-proxy-list-s2.html')
	ret.append('http://www.my-proxy.com/free-proxy-list-s3.html')
	return ret

def parse_page_5(html=''):
  matches = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})', html)
  ret = []
  method = 1
  type = -1
  for match in matches:
		ip = match[0]
		port = match[1]
		area = '--'        
		ret.append([ip, port, method, type, area])
		if indebug:
		  print '5', ip, port, method, type, area
  return ret

def build_list_urls_6(page=4):
  ret = []
  ret.append('http://www.cybersyndrome.net/plr5.html')
  ret.append('http://www.cybersyndrome.net/pla5.html')
  ret.append('http://www.cybersyndrome.net/pld5.html')
  return ret

def parse_page_6(html=''):
  matches = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})', html)
  ret = []
  for match in matches:
		ip = match[0]
		port = match[1]
		method = 1
		area = '--'
		type = -1
		ret.append([ip, port, method, type, area])
		if indebug:
		  print '6', ip, port, method, type, area
  return ret



def build_list_urls_7(page=3):
    ret = []
    ret.append('http://cn-proxy.com/')
    return ret

def parse_page_7(html=''):
    ret = []
    matches = re.findall('<tr>[\s\S]*?<td>(.*?)</td>[\s\S]*?<td>(.*?)</td>[\s\S]*?<td>(.*?)</td>[\s\S]*?</tr>', html, re.IGNORECASE)	
    for match in matches:
      ip = match[0]
      port = match[1]
      method = 1
      type = -1
      area = match[2]
      ipmatch = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', ip)
      if len(ipmatch) > 0:
        ret.append([ip, port, method, type, area])
        if indebug:
          print '7', ip, port, method, type, area.decode('utf8')
    return ret
    
def build_list_urls_8(page=3):
    ret = []
    ret.append('http://cn-proxy.com/archives/218')
    return ret

def parse_page_8(html=''):
    ret = []
    matches = re.findall('<tr>[\s\S]*?<td>(.*?)</td>[\s\S]*?<td>(.*?)</td>[\s\S]*?<td>(.*?)</td>[\s\S]*?<td>(.*?)</td>[\s\S]*?</tr>', html, re.IGNORECASE)	
    for match in matches:
      ip = match[0]
      port = match[1]
      method = 1
      type = match[2].decode('utf8')
      if type == u'透明':        # transparent
        type = 0
      elif type == u'普通匿名':  # anonymous
        type = 1
      elif type == u'高度匿名':  # high anonymity
        type = 2
      else:
        type = -1
      area = match[3]
      ipmatch = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', ip)
      if len(ipmatch) > 0:
        ret.append([ip, port, method, type, area])
        if indebug:
          print '8', ip, port, method, type, area.decode('utf8')
    return ret
    
def build_list_urls_9(page=4):
  # youdaili.cn groups its proxy posts under several category index pages;
  # collect the article links from each of them.
  ret = []
  for caturl in ['http://www.youdaili.cn/Daili/http/',
                 'http://www.youdaili.cn/Daili/QQ/',
                 'http://www.youdaili.cn/Daili/guonei/',
                 'http://www.youdaili.cn/Daili/guowai/',
                 'http://www.youdaili.cn/Daili/Socks/']:
    rehtml = get_html(caturl)
    soup = BeautifulSoup(rehtml, from_encoding='utf-8')
    urllist = soup.find_all('ul', 'newslist_line')[0].find_all('li')
    for url in urllist:
      ret.append(url.find_all('a')[0]['href'])
  return ret

def parse_page_9(html=''):
  matches = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})\@(.*?)\#(.*?)<br \/>', html)
  ret = []
  for match in matches:
		ip = match[0]
		port = match[1]
		method = match[2]
		if method == 'HTTP':
		  method = 1
		elif method == 'SOCKS4':
		  method = 2
		elif method == 'SOCKS5':
		  method = 3
		else:
		  continue
		area = match[3]
		type = -1
		ret.append([ip, port, method, type, area])
		if indebug:
		  print '9', ip, port, method, type, area.decode('utf8')
  return ret
  
def build_list_urls_10(page=4):
  ret = []
  rehtml = get_html("http://www.itmop.com/proxy/")
  soup = BeautifulSoup(rehtml, from_encoding='utf-8')
  urllist = soup.find_all('dt')
  for url in urllist:
    ret.append(url.find_all('a')[0]['href'])
  return ret

def parse_page_10(html=''):
  matches = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})\@(.*?)\;(.*?)<br \/>', html)
  ret = []
  for match in matches:
		ip = match[0]
		port = match[1]
		method = match[2]
		if method == 'HTTP':
		  method = 1
		elif method == 'SOCKS4':
		  method = 2
		elif method == 'SOCKS5':
		  method = 3
		else:
		  continue
		area = match[3]
		type = -1
		ret.append([ip, port, method, type, area])
		if indebug:
		  print '10', ip, port, method, type, area.decode('utf8')
  return ret
  
def build_list_urls_11(page=4):
  ret = []
  ret.append('http://pachong.org/')
  ret.append('http://pachong.org/anonymous.html')
  ret.append('http://pachong.org/transparent.html')
  ret.append('http://pachong.org/area/short/name/cn.html')
  ret.append('http://pachong.org/area/short/name/br.html')
  ret.append('http://pachong.org/area/short/name/us.html')
  ret.append('http://pachong.org/area/short/name/ve.html')
  ret.append('http://pachong.org/area/short/name/in.html')
  return ret

def parse_page_11(html=''):
  matches = re.findall('<tr.*?data-type=\"(.*?)\">[\s\S]*?<td.*?>.*?</td>[\s\S]*?<td>(.*?)</td>[\s\S]*?<td>(.*?)</td>[\s\S]*?<td>[\s\S]*?<img.*?>[\s\S]*?<a.*?>(.*?)</a>', html, re.IGNORECASE)
  ret = []
  for match in matches:
		ip = match[1]
		port = match[2]
		method = 1
		area = match[3]
		type = match[0]
		if type == 'anonymous':
		  type = 1
		elif type == 'transparent':
		  type = 0
		elif type == 'high':
		  type = 2
		elif type == 'socks4':
		  method = 2
		  type = -1
		elif type == 'socks5':
		  method = 3
		  type = -1
		else:
		  type = -1
		ret.append([ip, port, method, type, area])
		if indebug:
		  print '11', ip, port, method, type, area.decode('utf8')
  return ret

def build_list_urls_12(page=10):
  ret = []
  ret.append('http://www.cz88.net/proxy/index.aspx')
  page = page + 1
  for i in range(2, page):
	  ret.append('http://www.cz88.net/proxy/http_%(n)01d.aspx' % {'n':i})
  return ret

def parse_page_12(html=''):
  matches = re.findall('<tr><td>(.*?)</td><td>(.*?)</td><td>.*?</td><td>.*?</td><td><div.*?>(.*?)</div></td></tr>', html, re.IGNORECASE)
  ret = []
  for match in matches:
		ip = match[0].decode('gbk').encode('utf8')
		port = match[1].decode('gbk').encode('utf8')
		method = 1
		area = match[2].decode('gbk').encode('utf8')
		type = -1
		ret.append([ip, port, method, type, area])
		if indebug:
		  print '12', ip, port, method, type, area.decode('utf8')
  return ret
  
def build_list_urls_13(page=10):
  ret = []
  ret.append('http://www.cz88.net/proxy/socks4.aspx')
  ret.append('http://www.cz88.net/proxy/socks4_2.aspx')
  ret.append('http://www.cz88.net/proxy/socks4_3.aspx')
  ret.append('http://www.cz88.net/proxy/socks5.aspx')
  ret.append('http://www.cz88.net/proxy/socks5_2.aspx')
  return ret

def parse_page_13(html=''):
  matches = re.findall('<tr><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>.*?</td><td><div.*?>(.*?)</div></td></tr>', html, re.IGNORECASE)
  ret = []
  for match in matches:
    ip = match[0].decode('gbk').encode('utf8')
    port = match[1].decode('gbk').encode('utf8')
    method = match[2].decode('gbk')
    if method == u'SOCKS4':
      method = 2
    elif method == u'SOCKS5':
      method = 3
    else:
      continue
    area = match[3].decode('gbk').encode('utf8')
    type = -1
    ret.append([ip, port, method, type, area])
    if indebug:
      print '13', ip, port, method, type, area.decode('utf8')
  return ret
		
			
# Worker thread: action 'getproxy' scrapes one source site (selected by
# index); any other action checks the proxies in `checklist` using the given
# `checkmothed` ('http' or 'connect').

class TEST(threading.Thread):
    def __init__(self, action, index=None, checklist=None, checkmothed='http'):
        threading.Thread.__init__(self)
        self.index = index
        self.action = action
        self.checklist = checklist
        self.checkmothed = checkmothed

    def run(self):
        if (self.action == 'getproxy'):
            get_proxy_one_website(self.index)
        else:
            check_proxy(self.index, self.checklist, self.checkmothed)


def check_proxy(index, checklist, checkmothed):
    for item in checklist:
        check_one_proxy(checkmothed, item[0], item[1], item[2])
        
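# Validate a single proxy: fetch target_url through it and treat it as alive
# when the response contains target_string and arrives within target_timeout
# seconds.  The result (ip, port, active flag, elapsed time) is queued in
# update_array and written to the database later by update_proxies().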
def check_one_proxy(checkmothed, ip, port, method):
    global update_array
    global check_in_one_call
    global target_url, target_string, target_timeout
      
    url = target_url
    checkstr = target_string
    timeout = target_timeout
    if checkmothed == 'http':
      if method == 1:
        proxy_handler = urllib2.ProxyHandler({'http': 'http://' + ip + ':' + str(port) + '/'})
        opener = urllib2.build_opener(proxy_handler)
        urllib2.install_opener(opener) 
      else:
        return  # SOCKS4/SOCKS5 proxies cannot be checked in plain 'http' mode
      	  
    elif checkmothed == 'connect':
      if method == 1:
        socks.setdefaultproxy(socks.PROXY_TYPE_HTTP, ip, int(port))
      elif method == 2:
        socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS4, ip, int(port))
      elif method == 3:
        socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, ip, int(port))
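      # NOTE: setdefaultproxy()/wrap_module() change process-wide state, so in
      # 'connect' mode concurrent checker threads can overwrite each other's proxy.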
      socks.wrap_module(urllib2) 
         
    send_headers = {
          'User-agent':'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)'
          }
    t1 = time.time()
    
    try:
      req = urllib2.Request(url, headers=send_headers)
      r = urllib2.urlopen(req, timeout=timeout)
      rehtml = r.read()
      pos = rehtml.find(checkstr)
    except Exception, e:
      pos = -1
      print e
    t2 = time.time()
    timeused = t2 - t1
    if (timeused < timeout and pos >= 0):
       active = 1
    else:
       active = 0    
    update_array.append([ip, port, active, timeused]) 


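# Load all unchecked proxies (active=0) from the database, split them evenly
# across threadCount worker threads, wait for the checks, then flush the
# results to the database via update_proxies().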
def check_all_proxy(threadCount, checkmothed='http'):
    global check_in_one_call, cursor
    threads = []        
    cursor.execute('select ip,port,method from proxy where active=0')
    rows = cursor.fetchall()  

    check_in_one_call = len(rows)
    
    # work out how many proxies each thread will check
    if len(rows) >= threadCount:
        num_in_one_thread = len(rows) / threadCount   
    else:
        num_in_one_thread = 1

    threadCount = threadCount + 1
    print u"现在开始验证以下代理服务器....."
    for index in range(1, threadCount):        
        # give each thread its slice of the list; the last thread also takes the remainder
        checklist = rows[(index - 1) * num_in_one_thread:index * num_in_one_thread]     
        if (index + 1 == threadCount):              
            checklist = rows[(index - 1) * num_in_one_thread:]

        t = TEST('', index, checklist, checkmothed)
        t.setDaemon(True)
        t.start()
        threads.append((t))
    for thread in threads:
        thread.join(60)        
    update_proxies()  # write all check results back to the database
    

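# Scrape a single source site: look up build_list_urls_<index> and
# parse_page_<index> by name, fetch every list page, and queue each
# syntactically valid proxy in proxy_array.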
def get_proxy_one_website(index):
    global proxy_array
    build_func = globals()['build_list_urls_' + str(index)]
    parse_func = globals()['parse_page_' + str(index)]
    urls = build_func()
    for url in urls:
        print url
        html = get_html(url)
        proxylist = parse_func(html)
        for proxy in proxylist:
            ip = string.strip(proxy[0])
            port = string.strip(proxy[1])
            if (re.compile("^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$").search(ip)):
                method = str(proxy[2])
                type = str(proxy[3])
                area = string.strip(proxy[4])
                proxy_array.append([ip, port, method, type, area])


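# Scrape the source sites.  webindex '0' means "all sites" and the results are
# written to the database; any other value scrapes only that one site as a
# test run and nothing is stored.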
def get_all_proxies(webindex='0'):
    global web_site_count, cursor 
    threads = []
    if webindex == '0':  
      print u"现在开始从以下" + str(web_site_count) + u"个网站抓取代理列表...."      
      count = web_site_count + 1
      for index in range(1, count):
          t = TEST('getproxy', index)
          t.setDaemon(True)
          t.start()
          threads.append((t))
      for thread in threads:
          thread.join(60)
      add_proxies_to_db()
    else:
      print u"现在开始从以下第" + webindex + u"个网站抓取代理列表...."      
      t = TEST('getproxy', webindex)
      t.setDaemon(True)
      t.start()
      threads.append((t))
      for thread in threads:
          thread.join(60)             
    

def add_proxies_to_db():
    global proxy_array
    # Insert every newly scraped proxy that is not already present.  The
    # queries are parameterized so stray quotes in scraped fields cannot
    # break the SQL.
    for item in proxy_array:
        cursor.execute("select ip from proxy where ip=%s", (item[0],))
        iplist = cursor.fetchall()
        if len(iplist) == 0:
            sql = ("insert into `proxy` (`ip`,`port`,`method`,`type`,`intime`,`area`) "
                   "values (%s,%s,%s,%s,now(),%s)")
            try:
                cursor.execute(sql, (item[0], item[1], item[2], item[3], clean_string(item[4])))
            except Exception, e:
                print e


def update_proxies():
    global update_array
    for item in update_array:
        sql = '''
             update `proxy` set `checktime`=now(), 
                `active`=%(active)01d, 
                 `speed`=%(speed)02.3f                 
                 where `ip`='%(ip)01s' and `port`=%(port)01d                            
            ''' % {'active':item[2], 'speed':item[3], 'ip':item[0], 'port':item[1]}
        try:
            cursor.execute(sql)    
        except:
            pass 

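# Normalize a scraped area string: quotes, commas, slashes, backslashes and
# whitespace runs are collapsed to single spaces before the value is stored.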
def clean_string(s):
    tmp = re.sub(r"['\,\s\\\/]", ' ', s)
    return re.sub(r"\s+", ' ', tmp)


def open_database():
    global conn, cursor, db_host, db_port, db_user, db_passwd, db_database, db_charset, webindex

    try:
      conn = MySQLdb.connect(host=db_host, port=db_port, user=db_user, passwd=db_passwd, db=db_database, charset=db_charset)
      cursor = conn.cursor()
    except:
      print u"Failed to connect to the database"
      sys.exit()
    if webindex == '0':  # only on a real run (-g / -c), not on a -t test scrape
      cursor.execute("delete from proxy where active=0 and checktime is not null")  # purge proxies that failed their last check


def close_database():
    global cursor, conn
    cursor.close()
    conn.close()

if __name__ == '__main__':
    if len(sys.argv) <= 1:
      usage()
      sys.exit()
    grab = False  # scrape the list sites?
    check = False  # check stored proxies?
    try:
      opts, args = getopt.getopt(sys.argv[1:], "hgc:t:")
    except getopt.GetoptError:
      usage()
      sys.exit()
    for op, value in opts:
      if op == "-h":
        usage()
        sys.exit()
      elif op == "-g":
        webindex = '0'
        grab = True
      elif op == '-c':
        if value not in ['http', 'connect']:
          usage()
          sys.exit()
        webindex = '0'
        check = True
        checkmethod = value
      elif op == "-t":
        webindex = value
        grab = True
      else:
        usage()
        sys.exit()
    open_database()
    if grab:
      get_all_proxies(webindex)
    if check:
      check_all_proxy(thread_num, checkmethod)
    close_database()
    print u"All done."
